import csv
import pandas as pd
# Clean data
#
# From https://en.wikipedia.org/wiki/List_of_public_corporations_by_market_capitalization
# Data Source : https://docs.google.com/spreadsheets/d/1NxEouZb0AHGZ3OcwUPp8Ga1LmBjp37NABtwdcYSE91I/edit#gid=115560802
# Visualization
# http://blockbuilder.org/clemsos/4923d7601f5e4b208b740123679047f7
# parse fist CSV
first = "raw/Data - List of public corporations by market capitalization - 2007-2006.csv"
df = pd.read_csv(first)
df.dropna(how="all", inplace=True) # drop empty lines
_id = df["Year"].astype(int).map(str) + "-" + df["Q"]+"-"+df["Rank"].astype(int).map(str)
q = df["Name"].str.split(' ', expand=True)
final_first = pd.concat([_id, q, df], axis=1, join='inner')
final_first = final_first.drop(['Name'], axis=1)
# rename columns
final_first.columns.values[0] = 'Id'
final_first.columns.values[1] = 'Company'
final_first.columns.values[2] = 'Company Link'
final_first = final_first.rename(index=str, columns={"Q": "Quarter"})
final_first["Headquarters Link"] = None
final_first.head()
print final_first.shape
# parse properly second CSV
second = "raw/Data - List of public corporations by market capitalization - 2008-2017.csv"
df = pd.read_csv(second)
df.dropna(how="all", inplace=True) # drop empty lines
print df.shape
col_names = [
('Q1', 'Q1 (Country)'),
('Q2', 'Q2 (Country)'),
('Q3', 'Q3 (Country'),
('Q4', 'Q4 (Country)')
]
final_second = pd.DataFrame()
for cols in col_names:
_id = df["Year"].astype(int).map(str) + "-" + cols[0]+"-"+df["Rank"].astype(int).map(str)
q = df[cols[0]].str.split(' ', expand=True)
q.columns = ["Company", "Company Link", "Market value", "Notes"]
result = pd.concat([_id, q], axis=1, join='inner')
result.columns.values[0] = 'Id'
result["Year"] = df["Year"].astype(int)
result["Quarter"]= cols[0] # add quarter
result["Rank"] = df["Rank"].astype(int)
result["Headquarters"] = df[cols[1]].str[1:-1].str.split('/', expand=True)[4]
result["Headquarters Link"] = df[cols[1]]
result["Primary industry"] = None
print result.shape
final_second = final_second.append(result)
print final_second.shape
final_second.head()
# fetch missing values from Wikipedia
import os
import json
import wptools
from slugify import slugify
# get unique values
companies = list(final_first.append(final_second)['Company'].unique())
print "%s companies"%len(companies)
# Count files in folder
# print len(os.listdir('./companies'))
companies_info = {}
for company_name in companies:
if type(company_name) is str :
json_file_name = "companies/%s.json"%slugify(company_name.decode('utf-8'))
if not os.path.exists(json_file_name):
print "fetching %s..."%company_name
# fetch infobox
page = wptools.page(company_name).get_parse()
infobox = page.data['infobox']
#save infobox as json file
print json_file_name
with open(json_file_name, 'w') as outfile:
json.dump(dict(infobox), outfile)
print "saved to %s"%json_file_name
print
else :
with open(json_file_name, 'r') as outfile:
companies_info[company_name] = json.load(outfile)
print "Fetched info about %s companies."%len(companies_info.keys())
# sort by categories
# Plateformes numériques : Conglomerate (Alphabet + Tencent) Internet (alibaba), Facebook, Amazon, Apple (>juillet 2008), Microsoft (>juin 2009)
# Technologie de l’information : Cloud computing (IBM), Computer software, Computer hardware, Consumer electronics, Apple (<juillet 2008), Microsoft (<juin 2009)
# Banque & finance: Banking, Conglomerate (Berskshire), Financial services
# Energie : Energy industry, Oil and gas,
# Médical & Pharma : Pharmaceutical industry, Medical equipment
# Télécom : Telecommunications
# Distribution : Retail, Online shopping, Fast-moving consumer goods (1 seul : Procter & Gamble)
# Equipements et resources : Conglomerate (GE), Automotive industry, Food processing, Metals,
# parse industry for each result
industries = {}
french_names = {
"Plateformes numériques" : ["Internet", "Cloud computing", ],
"Technologie de l’information" : ["Computer software", "Computer hardware", "Consumer electronics"],
"Banque & Finance" : ["Bank", "Banking", "Financial services"],
"Energie" : ["Energy industry", "Oil and gas", "Petroleum industry", "List of petroleum companies", "Oil and gas industry"],
"Médical & Pharma" : ["Pharmaceutical industry", "Medical equipment"],
"Distribution" : ["Retail", "Online shopping", "Fast-moving consumer goods"],
"Télécom" : ["Telecommunication", "Telecommunications"],
"Equipements & Ressources" : ["Automotive industry", "Food processing", "Metals"]
}
translations = {}
for fr in french_names:
for name in french_names[fr] :
translations[name] = fr
# print translations
for company_name in companies_info :
info = companies_info[company_name]
s = info[u'industry']
industry = s[s.find("[[")+1:s.find("]]")][1:].split("|")[0]
if industry == "Conglomerate (company)" or industry == "Conglomerate (company)":
if company_name == "Berkshire Hathaway":
industries[company_name] = "Banque & Finance"
elif company_name == "General Electric":
industries[company_name] = "Equipements & Ressources"
elif company_name == "Alphabet Inc.":
industries[company_name] = "Plateformes numériques"
elif company_name == "Tencent":
industries[company_name] = "Plateformes numériques"
else :
if company_name == "Amazon.com":
industries[company_name] = "Plateformes numériques"
else :
industries[company_name] = translations[industry]
# print company_name
# print industries.keys()
print 'Industries parsed.'
# print industries
# merge both
final = final_first.append(final_second)
print final.shape
# remap all values using updated data
final["Primary industry"] = final["Company"]
final = final.replace({ "Primary industry" : industries })
# print final.replace({ "Primary industry" : industries })
# move Apple and Microsoft after 2016
final.ix[ (final['Company'] == "Apple Inc.") & (final['Year'] > 2008), "Primary industry"] = "Plateformes numériques"
final.ix[(final['Company'] == "Microsoft") & (final['Year'] > 2008), "Primary industry"] = "Plateformes numériques"
# some final clean
final = final.drop(["Notes", "Headquarters Link"], axis=1)
final["Market value"] = final["Market value"].str.split('[', expand=True)[0] # remove
final["Company Link"] = final["Company Link"].str[1:-1]
# final["Primary industry"] = final["Primary industry"].str.split(' ', expand=True)[0] # remove
final.head()
# save as file
final.to_csv("Data - List of public corporations by market capitalization - FINAL.csv", index=False)
print 'results saved to "./final.csv"'
final.head()
# sort result by year/quarter/industry
results_by_quarter = {}
# get all results by quarter
for index, row in final.sort_values("Id").iterrows():
quarter = "%s-%s"%(row["Year"], row["Quarter"])
try :
results_by_quarter[quarter].append((row["Primary industry"], row["Market value"]))
except :
results_by_quarter[quarter] = [ (row["Primary industry"], row["Market value"]) ]
# sum up results by industry
final_results = {}
for q in results_by_quarter:
q_results = {}
for res in results_by_quarter[q]:
if res[1] is not None:
industry = res[0]
amount = float(''.join(res[1].split(",")))
try :
q_results[industry] = q_results[industry]+amount
except KeyError:
q_results[industry] = amount
final_results[q] = q_results
# make percent
# total = sum([q_results[qr] for qr in q_results])
# final_results[q] = { qr : q_results[qr]/total*100 for qr in q_results}
df = pd.DataFrame(final_results)
df.to_csv("Data - Market capitalization by Industries.csv")
print 'results saved.'
df.head()